require(tidyverse)
require(flowCore)
require(flowClust)
require(openCyto)
require(ggcyto)
require(cowplot)
import the data from the RDSS (just once) and then write it to the local disk
Simplify the sample names
# import the longest substring function from the PTXQC package (https://rdrr.io/cran/PTXQC/man/LCSn.html)
source("../../script/20220326-simplify-names-subroutine.R")
shortNames <- simplifyNames(oriNames) %>%
gsub(".fcs","",.)
sampleNames(fs) <- shortNames
Metadata
sample <- read.csv("20220602-sample-list.csv") %>%
column_to_rownames(var = "file")
pData(fs) <- sample
Next we use a series of plots to guide our gating strategy for identifying the population we want to work with. ### Remove outliers We first gate on FSC.H and SSC.H to remove outliers (events that are too big or too small). The Attune instrument we use can record six decades (100-106), with the first two decades mostly occupied by electronic noise.
Let’s first define a gate and visualize it in a plot before adding it to a GatingSet.
outlier.gate <- rectangleGate(filterId = "-outlier", "FSC.H" = c(1.2e5, 1e6), "SSC.H" = c(1e2, 1e6))
ggcyto(fs, aes(x = FSC.H, y = SSC.H), subset = "root") +
geom_hex(bins = 64) + geom_gate(outlier.gate) + facet_wrap(~name, ncol = 7) + theme_bw()
Add this gate to the GatingSet
gs <- GatingSet(fs) # create a GatingSet
gs_pop_add(gs, outlier.gate, parent = "root")
[1] 2
recompute(gs)
done!
When I plotted the singlet events on GFP-RFP 2d space, I noticed a few samples that show more than one population of cells, where the main population appeared to be “induced” while one or more subpopulations are less or not induced. While the biological reasons behind require further investigation and may be very interesting (heterogeneity), for this analysis we will use flowClust to identify the main population and move forward.
ggcyto(gs, aes(x = BL1.H, y = YL2.H), subset = "-outlier") + geom_hex(bins = 64) +
facet_wrap(~name, ncol = 7) + scale_x_logicle() + scale_y_logicle() + theme_bw()
Be careful when working with the GatingSet and GatingHierarchy objects – these are strictly reference classes, meaning that most of the operations work by pointers and the operations will change the underlying data. For example, the first line of the code below (commented out) obtains a pointer to the underlying data rather than making a copy of that data. any operations on it will change the original data as a result.
#ex <- gs_pop_get_data(gs, "singlet")[[1]]
ex <- fs[["218-555-1"]]
#lgcl <- estimateLogicle(ex, channels = c("BL1.H", "YL2.H"))
lgcl <- logicleTransform("induction")
# set cluster gate parameters
k = 1; q = 0.9
# end setting
ex <- transform(ex, lgBL1.H = lgcl(`BL1.H`), lgYL2.H = lgcl(`YL2.H`))
fluo.gate <- gate_flowclust_2d(ex, "lgBL1.H", "lgYL2.H", K = k, quantile = q)
The prior specification has no effect when usePrior=no
Using the serial version of flowClust
ggcyto(ex, aes(x = lgBL1.H, y = lgYL2.H)) + geom_hex(bins = 64) + geom_gate(fluo.gate) + geom_stats()# + scale_x_logicle() + scale_y_logicle()
Even though flowClust is supposed to perform its own transformation
(modified Box-Cox), empirically I found the clustering seem to work
better on logicle transformed data for the two fluorescent channels.
Therefore I’m transforming the underlying data of the GatingSet. Note
that it seems to be difficult to “create new parameters” to store the
transformed data, while keeping the original data intact. Instead, the
transformation functions constructed using the constructor
logicle_trans() stores the inverse transformation
functions, which can be used to perform the inverse transformation when
needed. Followed the manual for GatingSet here
lgcl <- logicle_trans()
transList <- transformerList(c(lgBL1.H = "BL1.H", lgclYL2.H = "YL2.H"), lgcl)
transform(gs, transList)
A GatingSet with 40 samples
to obtain the original data, use
gs_pop_get_data(gs[[1]], inverse.transform = TRUE)
Now we can do the flowClust gating
dat <- gs_pop_get_data(gs, "-outlier") # get parent data
inductionGate <- fsApply(dat, function(fr)
openCyto::gate_flowclust_2d(fr, "BL1.H", "YL2.H", K = k, quantile = q)
)
gs_pop_add(gs, inductionGate, parent = "-outlier", name = "induction")
recompute(gs)
ggcyto(gs, aes(x = BL1.H, y = YL2.H), subset = "-outlier") + geom_hex(bins = 64) + geom_gate("induction") + geom_stats() +
facet_wrap(~name, ncol = 7) + theme_bw()
Several samples didn’t work well with the clustering algorithm. Here we will use a 2 cluster gating strategy to select the expressing population.
list.redo <- paste(rep(c("211", "218", "241"), each = 3), "555", 1:3, sep = "-")
newGate <- lapply(list.redo, function(x){
gate_flowclust_2d(dat[[x]], "BL1.H", "YL2.H", K = 2, quantile = 0.9, target = c(3,4))
})
The prior specification has no effect when usePrior=no
Using the serial version of flowClust
The prior specification has no effect when usePrior=no
Using the serial version of flowClust
The prior specification has no effect when usePrior=no
Using the serial version of flowClust
The prior specification has no effect when usePrior=no
Using the serial version of flowClust
The prior specification has no effect when usePrior=no
Using the serial version of flowClust
The prior specification has no effect when usePrior=no
Using the serial version of flowClust
The prior specification has no effect when usePrior=no
Using the serial version of flowClust
The prior specification has no effect when usePrior=no
Using the serial version of flowClust
The prior specification has no effect when usePrior=no
Using the serial version of flowClust
names(newGate) <- list.redo
ggcyto(gs[list.redo], aes(x = BL1.H, y = YL2.H), subset = "-outlier") + geom_hex(bins = 64) +
geom_gate(newGate) + theme_bw() #+ geom_stats()
update the inductionGate object
gs_pop_set_gate(gs[list.redo], "induction", newGate)
recompute(gs[list.redo], "induction")
Check the results
ggcyto(gs, aes(x = BL1.H, y = YL2.H), subset = "-outlier") +
geom_hex(bins = 64) + geom_gate("induction") + geom_stats() +
facet_wrap(~name, ncol = 7) + theme_bw()
Brian Metzger and colleagues proposed a simple correction in their 2015 paper. The intuition behind this method is that FSC is proportional to the max 2d projection (area) of a cell, and thus FSC^(3/2) should be roughly proportional to the volume. By contrast, the fluorescent channels should be the cumulative intensity from the whole cell. Therefore dividing FP intensity by FSC^(3/2) should effectively remove the variation due to cell size difference. He did say that the Wittkopp lab later switched to a different method based on PCA and rotations. After reading those papers, I thought the simpler one will suffice for us.
Test normalization formula
# get population data
fs.out <- gs_pop_get_data(gs, y = "induction", inverse.transform = TRUE) # get the inverse transformed data
# come up with an approximate FSC.H value for an average event to be used a scalar for the next step
mfsc <- 5e5 # based on the mode of the median of FSC.H from all samples
# fs.out is of the cytoframe class, which is a reference class. need to convert to flowframe for transformation
# https://www.bioconductor.org/packages/devel/bioc/vignettes/flowWorkspace/inst/doc/flowWorkspace-Introduction.html
exponent <- 1.5
# ex <- cytoframe_to_flowFrame(fs.out[["A9"]]) %>%
# transform(nFSC = FSC.H/mfsc) %>%
# transform(nGFP = BL1.H/nFSC^(exponent), nRFP = YL2.H/nFSC^(exponent))
ex <- cytoset_to_flowSet(fs.out) %>%
transform(nFSC = FSC.H/mfsc) %>%
transform(nGFP = BL1.H/nFSC^(exponent), nRFP = YL2.H/nFSC^(exponent))
plot the examples
ggplot(ex, aes(x = nFSC, y = nRFP/1e3)) + geom_hex(bins = 64) + scale_y_log10() + stat_smooth(method = "lm") +
scale_fill_gradientn(colours = rev(RColorBrewer::brewer.pal(11, "Spectral"))) + facet_wrap(~name, scales = "free_y") +
theme_bw()
Warning: Transformation introduced infinite values in continuous y-axis
Warning: Transformation introduced infinite values in continuous y-axis
Warning: Removed 11 rows containing non-finite values (stat_binhex).
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 11 rows containing non-finite values (stat_smooth).
# calculate ratio
norm.data <- fsApply(fs.out, function(cf) {
cf <- cbind(cf,
nRFP = cf[,"YL2.H"] / (cf[, "FSC.H"]/mfsc)^(exponent),
nGFP = cf[,"BL1.H"] / (cf[, "FSC.H"]/mfsc)^(exponent))
apply(cf[, c("FSC.H", "BL1.H", "YL2.H", "nGFP", "nRFP")], 2, median)
}, use.exprs = TRUE) %>%
as_tibble(rownames = "name") %>%
mutate(across(BL1.H:nRFP, ~ round(.x, 1)))
The goal is to export the gated events and calculate the RFP/GFP and take the median, which will be used in downstream analyses.
Get the population stats
stats <- gs_pop_get_stats(gs) %>%
as_tibble() %>%
mutate(pop = gsub(".*/", "", pop), pop = gsub("-outlier", "cells", pop)) %>%
pivot_wider(names_from = pop, names_prefix = "n_", values_from = count)
Export the data
# pull all info together in a single tibble
final <- left_join(as_tibble(pData(fs)), stats, by = c("name" = "sample")) %>%
left_join(norm.data, by = "name")
write_tsv(final, "20220602-gated-median-out.txt")